package com.cmge.ad.spider;

import java.util.ArrayList;
import java.util.List;
import java.util.Locale;
import java.util.concurrent.ConcurrentLinkedQueue;

import org.springframework.util.StringUtils;

import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.processor.PageProcessor;

import com.cmge.ad.cache.ehcache.EhCacheCacheManager;
import com.cmge.ad.model.ArticleStore;
import com.cmge.ad.model.CrawlLog;
import com.cmge.ad.service.CrawlService;
import com.cmge.ad.util.JsonUtil;
import com.cmge.ad.util.SuprUtil;
import com.cmge.ad.util.TimeUtils;
import com.cmge.ad.util.context.SpringContextUtil;

/**
 * @desc	Crawler for joke sites that follow a list-page / detail-page layout.
 * 
 * 		1. Fetch the list pages.
 * 		2. Extract the detail-page URLs from each list page.
 * 		3. Extract the configured content from each detail page.
 * 
 * 		Structure:
 * 		1. Configuration items
 * 		2. Log monitoring items
 * 		3. Statistics items
 * 
 * 		NOTE(review): the mutable crawl state ({@code flag}, {@code repeatFlag},
 * 		{@code repeatTime}) is not synchronized. If the Spider is run with more
 * 		than one worker thread this state is shared between threads - confirm
 * 		the thread configuration used by the caller.
 * 
 * @author	ljt
 * @time	2014-12-29 11:16:05 AM
 */
public class ArticleListPageProcessor implements PageProcessor {
	
	// Cache region that holds the per-task crawl log queue.
	private static final String LOG_REGION = "CrawlLog";
	
	// Value passed to ArticleStore.setType when no picture URL was extracted.
	private static final int TYPE_WITHOUT_PICTURE = 1;
	
	// Value passed to ArticleStore.setType when a picture URL was extracted.
	private static final int TYPE_WITH_PICTURE = 2;
	
	private CrawlService crawlService;
	
	private EhCacheCacheManager cacheManager;
	
	private Spider spider;
	
	private Site site = Site.me().setRetryTimes(3)
								 .setUserAgent("Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 Safari/537.36");

	// Crawl task number; same as the primary key id of the crawl task table
	private String id;
	
	// Sleep time
	private int sleepTime;
	
	// Used to validate the regular expression
	private String listUrl;
	
	private String infoUrlRegular;
	
	private String listUrlRegular;
	
	// URL template of the list pages; "$1" is replaced by the page number
	private String listUrlTemplate;
	
	// Used to validate the regular expression
	private String infoUrl;

	// First list page number
	private int start;
	
	// Last list page number (inclusive)
	private int end;
	
	// XPath of the detail-page URLs
	private String infoUrlXpath;
	
	// Prefix prepended to every detail-page URL
	private String infoUrlPre;
	
	// XPath of the joke content
	private String infoContentXpath;
	
	// XPath of the joke picture
	private String infoPicXpath;
	
	// XPath of the field used to detect duplicates on the detail page
	private String repeatFlagXpath;
	
	// Upper limit of duplicate hits
	private int repeatMaxTimes;
	
	// Current number of duplicate hits
	private int repeatTime;
	
	private Integer siteId;
	
	private Integer siteCategoryId;
	
	// List-page loading switch: true until all list pages have been queued once
	private boolean flag = true;
	
	// Global switch: set to false once the duplicate limit has been reached
	private boolean repeatFlag = true;
	
	/**
	 * Creates a processor and looks up its collaborators
	 * ({@link CrawlService} and {@link EhCacheCacheManager}) from the Spring context.
	 */
	public ArticleListPageProcessor() {
		this.crawlService = SpringContextUtil.getApplicationContext().getBean(CrawlService.class);
		this.cacheManager = SpringContextUtil.getApplicationContext().getBean(EhCacheCacheManager.class);
	}

	/**
	 * Handles one downloaded page: pages whose URL matches {@code listUrlRegular}
	 * are treated as list pages, every other page as a detail page.
	 * Does nothing once the duplicate limit has switched the crawl off.
	 * 
	 * @param page the downloaded page
	 */
	@Override
	public void process(Page page) {
		// The duplicate limit has been reached: ignore pages that are still in flight.
		if (!repeatFlag) {
			return;
		}
		
		if (page.getUrl().regex(listUrlRegular).match()) {
			processListPage(page);
		} else {
			processDetailPage(page);
		}
	}

	/**
	 * Queues all list pages (first call only) and the detail-page URLs found on this list page.
	 */
	private void processListPage(Page page) {
		if (flag) {
			queueAllListPages(page);
			flag = false;
		}
		
		// Find all detail-page URLs on the current list page. The regex filter is
		// bypassed when it is empty or the catch-all "\w+".
		List<String> infoUrlList;
		if (!StringUtils.isEmpty(infoUrlRegular) && !infoUrlRegular.equals("\\w+")) {
			infoUrlList = page.getHtml().xpath(infoUrlXpath).regex(infoUrlRegular).all();
		} else {
			infoUrlList = page.getHtml().xpath(infoUrlXpath).all();
		}
		
		if (SuprUtil.isEmptyCollection(infoUrlList)) {
			return;
		}
		
		if (!StringUtils.isEmpty(infoUrlPre)) {
			for (String infoUrlSuffix : infoUrlList) {
				page.addTargetRequest(infoUrlPre + infoUrlSuffix);
			}
		} else {
			page.addTargetRequests(infoUrlList);
		}
		// Logged for both branches so that prefixed URLs are reported as well.
		putCache(LOG_REGION, id, "加载[<font style='color:red'>"+page.getUrl().toString()+"</font>]详情页成功，数据大小："+infoUrlList.size());
	}

	/**
	 * Builds the URLs of list pages {@code start..end} from the template and queues them.
	 */
	private void queueAllListPages(Page page) {
		putCache(LOG_REGION, id, "+++++++++++<font style='color:red'>开始加载列表页面</font>+++++++++++");
		List<String> listPageUrls = new ArrayList<String>();
		for (int pageNo = start; pageNo <= end; pageNo++) {
			String url = listUrlTemplate.replace("$1", String.valueOf(pageNo));
			putCache(LOG_REGION, id, "加载[<font style='color:red'>"+url+"</font>]成功");
			listPageUrls.add(url);
		}
		page.addTargetRequests(listPageUrls);
		putCache(LOG_REGION, id, "+++++++++++<font style='color:red'>加载列表页面结束</font>+++++++++++");
	}

	/**
	 * Extracts content, picture and unique id from a detail page and stores the
	 * article unless it is blank, a gif, lacks a unique id or is a duplicate.
	 */
	private void processDetailPage(Page page) {
		putCache(LOG_REGION, id, "抓取详情页：[<font style='color:red'>"+page.getUrl().toString()+"</font>]");
		
		// Content: blank (null, empty or whitespace-only) content is skipped,
		// because it would otherwise be stored as an empty string after trim().
		String content = page.getHtml().xpath(infoContentXpath).get();
		if (!StringUtils.hasText(content)) {
			skipPage(page, "段子内容为空，跳过...");
			return;
		}
		
		// Picture
		int type = TYPE_WITHOUT_PICTURE;
		String picUrl = "";
		if (!StringUtils.isEmpty(infoPicXpath)) {
			picUrl = page.getHtml().xpath(infoPicXpath).get();
			if (!StringUtils.isEmpty(picUrl)) {
				type = TYPE_WITH_PICTURE;
				// Locale.ROOT keeps the check independent of the JVM default locale.
				if (picUrl.toLowerCase(Locale.ROOT).contains(".gif")) {
					skipPage(page, "图片格式是gif，跳过...");
					return;
				}
			}
		}
		
		// Unique identifier
		String uniqueId = page.getHtml().xpath(repeatFlagXpath).get();
		if (StringUtils.isEmpty(uniqueId)) {
			skipPage(page, "唯一标识为空 ，跳过...");
			return;
		}
		
		if (crawlService.isExistArticleUniqueId(uniqueId)) {
			handleDuplicate(page, uniqueId);
			return;
		}
		
		ArticleStore articleStore = new ArticleStore();
		articleStore.setContent(content.trim());
		articleStore.setMinImageUrl(picUrl);
		articleStore.setMaxImageUrl(picUrl);
		articleStore.setType(type);
		articleStore.setSiteId(siteId);
		articleStore.setSiteCategoryId(siteCategoryId);
		articleStore.setUniqueId(uniqueId);
		
		// Add the extracted article to the log cache
		putCache(LOG_REGION, id, "<font style='color:red'>抓取内容：</font>"+JsonUtil.toJson(articleStore));
		try {
			crawlService.addArticleStore(articleStore);
		} catch (Exception e) {
			// Best effort: a failed save is reported in the crawl log and the crawl goes on.
			putCache(LOG_REGION, id, "<font style='color:red'>抓取内容保存异常</font>，异常信息：["+e.getMessage()+"]");
		}
	}

	/**
	 * Counts a duplicate hit; stops the whole crawl once {@code repeatMaxTimes} is reached.
	 * The page is skipped in both cases.
	 */
	private void handleDuplicate(Page page, String uniqueId) {
		repeatTime++;
		if (repeatTime >= repeatMaxTimes) {
			putCache(LOG_REGION, id, "重复次数大于最大上限[<font style='color:red'>"+repeatMaxTimes+"</font>]，退出爬虫");
			// Exit the crawler
			repeatFlag = false;
			stopCrawl();
			page.setSkip(true);
		} else {
			skipPage(page, "唯一标识[<font style='color:red'>"+uniqueId+"</font>]已存在,当前重复次数："+repeatTime+"次");
		}
	}

	/**
	 * Writes the reason to the crawl log and marks the page as skipped.
	 */
	private void skipPage(Page page, String log) {
		putCache(LOG_REGION, id, log);
		page.setSkip(true);
	}

	/**
	 * Stops the crawl task. Does nothing when no spider has been set.
	 */
	public void stopCrawl(){
		if (spider != null) {
			spider.stop();
		}
	}

	/**
	 * Puts a log entry into the cache.
	 * 
	 * @param region cache region
	 * @param key cache key
	 * @param log log text
	 */
	private void putCache(String region, String key, String log) {
		Object cached = cacheManager.get(region, key);
		// Only this method is seen writing to this region/key, and it always stores a queue of CrawlLog.
		@SuppressWarnings("unchecked")
		ConcurrentLinkedQueue<CrawlLog> queue = (cached == null)
				? new ConcurrentLinkedQueue<CrawlLog>()
				: (ConcurrentLinkedQueue<CrawlLog>) cached;
		
		CrawlLog crawlLog = new CrawlLog();
		crawlLog.setLog(log);
		crawlLog.setTime(TimeUtils.getTime());
		// add() appends at the tail, so the newest entry is the last element of the queue.
		queue.add(crawlLog);
		cacheManager.put(region, key, queue);
	}

	public String getListUrl() {
		return listUrl;
	}

	public ArticleListPageProcessor setListUrl(String listUrl) {
		this.listUrl = listUrl;
		return this;
	}

	public String getInfoUrl() {
		return infoUrl;
	}

	public ArticleListPageProcessor setInfoUrl(String infoUrl) {
		this.infoUrl = infoUrl;
		return this;
	}

	@Override
	public Site getSite() {
		return site;
	}
	
	public String getListUrlTemplate() {
		return listUrlTemplate;
	}

	public ArticleListPageProcessor setListUrlTemplate(String listUrlTemplate) {
		this.listUrlTemplate = listUrlTemplate;
		return this;
	}

	public int getStart() {
		return start;
	}

	public ArticleListPageProcessor setStart(int start) {
		this.start = start;
		return this;
	}

	public int getEnd() {
		return end;
	}

	public ArticleListPageProcessor setEnd(int end) {
		this.end = end;
		return this;
	}

	public String getInfoUrlXpath() {
		return infoUrlXpath;
	}

	public ArticleListPageProcessor setInfoUrlXpath(String infoUrlXpath) {
		this.infoUrlXpath = infoUrlXpath;
		return this;
	}

	public String getInfoUrlPre() {
		return infoUrlPre;
	}

	public ArticleListPageProcessor setInfoUrlPre(String infoUrlPre) {
		this.infoUrlPre = infoUrlPre;
		return this;
	}

	public String getInfoContentXpath() {
		return infoContentXpath;
	}

	public ArticleListPageProcessor setInfoContentXpath(String infoContentXpath) {
		this.infoContentXpath = infoContentXpath;
		return this;
	}

	public String getInfoPicXpath() {
		return infoPicXpath;
	}

	public ArticleListPageProcessor setInfoPicXpath(String infoPicXpath) {
		this.infoPicXpath = infoPicXpath;
		return this;
	}

	public String getRepeatFlagXpath() {
		return repeatFlagXpath;
	}

	public ArticleListPageProcessor setRepeatFlagXpath(String repeatFlagXpath) {
		this.repeatFlagXpath = repeatFlagXpath;
		return this;
	}

	public int getRepeatMaxTimes() {
		return repeatMaxTimes;
	}

	public ArticleListPageProcessor setRepeatMaxTimes(int repeatMaxTimes) {
		this.repeatMaxTimes = repeatMaxTimes;
		return this;
	}

	/**
	 * Replaces the site configuration. The new site is used as given; a sleep time
	 * set earlier through {@link #setSleepTime(int)} is not copied onto it.
	 */
	public ArticleListPageProcessor setSite(Site site) {
		this.site = site;
		return this;
	}
	
	public Integer getSiteId() {
		return siteId;
	}

	public ArticleListPageProcessor setSiteId(Integer siteId) {
		this.siteId = siteId;
		return this;
	}

	public Integer getSiteCategoryId() {
		return siteCategoryId;
	}

	public ArticleListPageProcessor setSiteCategoryId(Integer siteCategoryId) {
		this.siteCategoryId = siteCategoryId;
		return this;
	}

	public int getSleepTime() {
		return sleepTime;
	}

	/**
	 * Stores the sleep time and applies it to the current {@link Site}.
	 */
	public ArticleListPageProcessor setSleepTime(int sleepTime) {
		this.sleepTime = sleepTime;
		site.setSleepTime(sleepTime);
		return this;
	}

	public int getRepeatTime() {
		return repeatTime;
	}

	public ArticleListPageProcessor setRepeatTime(int repeatTime) {
		this.repeatTime = repeatTime;
		return this;
	}

	public Spider getSpider() {
		return spider;
	}

	public void setSpider(Spider spider) {
		this.spider = spider;
	}

	public String getId() {
		return id;
	}

	public ArticleListPageProcessor setId(String id) {
		this.id = id;
		return this;
	}

	public String getInfoUrlRegular() {
		return infoUrlRegular;
	}

	public ArticleListPageProcessor setInfoUrlRegular(String infoUrlRegular) {
		this.infoUrlRegular = infoUrlRegular;
		return this;
	}

	public String getListUrlRegular() {
		return listUrlRegular;
	}

	public ArticleListPageProcessor setListUrlRegular(String listUrlRegular) {
		this.listUrlRegular = listUrlRegular;
		return this;
	}
}
